# PREDICT DISTRIBUTION OF JEWS BY COUNTY
# Author: Kasia Nalewajko

rm(list = ls())

library(dplyr); library(sf); library(haven); library(MASS)


load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main_dept.Rda")
load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/main.Rda")


for_estimation <- main %>%
  dplyr::select(dept_code, deppct, klarsfeld_deported_sum, number_jews_above_15_dep, nsynagogues, spector_jews, pop1936, railway_km_1922, rug_sd, frelev_mean) %>%
  unique()

for_estimation$spector_jews <- ifelse(is.na(for_estimation$spector_jews), 0, for_estimation$spector_jews) # if no information about Jews' residence in Spector, assume that according to Spector there were no significant Jewish populations there

# add Insee population stats

# county

load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/population_summaries_deppct.Rda")

for_estimation <- left_join(for_estimation, population_summaries_deppct, by = "deppct")
for_estimation$pop1936.x <- NULL
for_estimation <- for_estimation %>%
  rename(pop1936 = pop1936.y) %>%
  mutate(pop_diff_1936_21 = pop1936-pop1921,
         pop_diff_1936_31 = pop1936-pop1931)
for_estimation$pop_diff_1936_21 <- ifelse(is.na(for_estimation$pop_diff_1936_21), 0, for_estimation$pop_diff_1936_21)
for_estimation$pop_diff_1936_31 <- ifelse(is.na(for_estimation$pop_diff_1936_31), 0, for_estimation$pop_diff_1936_31)

# province

load("./00 SUBMITTED/00 APSR final/04 replication_files/01 data/population_summaries_dep.Rda")

frdep <- left_join(frdep, population_summaries_dep, by = c("dept_code" = "dep"))
frdep$pop1911.x <- NULL
frdep$pop1936.x <- NULL

frdep <- frdep %>% # summarise information about the population movements in the prewar period
  rename(pop1936 = pop1936.y,
         pop1911 = pop1911.y) %>%
  mutate(pop_diff_1936_21 = pop1936-pop1921,
         pop_diff_1936_31 = pop1936-pop1931)

# add DHI

dhi <- main %>% 
  dplyr::select(deppct, DHI_milipol_sum1940)

for_estimation <- left_join(for_estimation, dhi)

# run a negative binomial model to understand which variables are good at predicting the outcome

model <- MASS::glm.nb(formula = number_jews_above_15 ~ # Klarsfeld summary of the Nazi census (count)
                        log(prewar_jpop_spector+1) + # pre-existing capital (significant predictor**)
                        log(nsynagogues+1) + # prewar communities
                        log(pop1936+1) + # economic and social capital / danger (significant predictor*)
                        pop_diff_1936_31 + # economy  (significant predictor*)
                        railway_km_log + # access (significant predictor**)
                        frelev_mean + # access/hiding (significant predictor*)
                        log(inc_posts1940_de+inc_posts1940_fr+1) + # (significant predictor*)
                        zone5, # control for the effects of occupation zones (that are already considered in the province-level census counts) in order to isolate the effects of other variables
                      data = frdep)

summary(model)
model$coefficients

# CALCULATE WEIGHTED SCORES OF COUNTY-LEVEL PREDICTORS BASED ON SIGNIFICANT PROVINCE-LEVEL COEFFICIENTS

for_estimation$weighted_score <- 
  log(for_estimation$spector_jews+1) * exp(model$coefficients[2]) + # assign more weight to strong predictors with p-value of approx. <= 0.01 (less uncertainty)
  for_estimation$pop_diff_1936_31 * model$coefficients[5] +
  log(for_estimation$railway_km_1922+1) * exp(model$coefficients[6]) +
  for_estimation$frelev_mean * model$coefficients[7] +
  log(for_estimation$DHI_milipol_sum1940+1) * model$coefficients[8] +
  log(for_estimation$pop1936+1) * model$coefficients[4] +
  log(for_estimation$nsynagogues+1) * abs(model$coefficients[3])

# Normalize the scores

for_estimation <- for_estimation %>% 
  group_by(dept_code) %>% 
  mutate(normalized_weighted_score = scale(weighted_score),
         normalized_weighted_score = as.numeric(normalized_weighted_score))

estimated <- for_estimation %>%
  group_by(dept_code) %>%
  mutate(
    total_normalized_weighted_score = sum(normalized_weighted_score, na.rm = T),
    proportion = ifelse(total_normalized_weighted_score > 0 & normalized_weighted_score > 0, normalized_weighted_score / (total_normalized_weighted_score), 0),
    proportion = ifelse(total_normalized_weighted_score < 0 & normalized_weighted_score < 0, normalized_weighted_score / (total_normalized_weighted_score), proportion),
    proportion = ifelse(proportion == 0, abs(normalized_weighted_score) / abs(total_normalized_weighted_score), proportion),
    proportion = ifelse(proportion == Inf, abs(normalized_weighted_score) / abs(total_normalized_weighted_score+1), proportion),
    allocated_jpop = number_jews_above_15_dep * proportion,
    allocated_jpop = allocated_jpop * (number_jews_above_15_dep / sum(allocated_jpop, na.rm = T)), # make sure that the sum of Jews by province corresponds to Klarsfeld 1978 province-level summaries of the Nazi census
    allocated_jpop = ifelse(is.na(allocated_jpop), 0, allocated_jpop),
    allocated_jpop = round(allocated_jpop)
    )

# inspect the results

temp <- estimated %>% 
  group_by(dept_code) %>% 
  summarise(sum_jews = sum(allocated_jpop)) %>% 
  arrange(desc(sum_jews)) 

temp2 <- estimated %>% 
  dplyr::select(dept_code, number_jews_above_15_dep) %>% 
  unique()

temp <- left_join(temp, temp2)

# add the result to the main data frame

allocation <- estimated %>% 
  dplyr::select(deppct, allocated_jpop)

# save(allocation, file = "./00 SUBMITTED/00 APSR final/04 replication_files/01 data/allocated_jpop.Rda")
